{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# M-Estimators for Robust Linear Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from statsmodels.compat import lmap\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* An M-estimator minimizes the function \n",
    "\n",
    "$$Q(e_i, \\rho) = \\sum_i~\\rho \\left (\\frac{e_i}{s}\\right )$$\n",
    "\n",
    "where $\\rho$ is a symmetric function of the residuals \n",
    "\n",
    "* The effect of $\\rho$ is to reduce the influence of outliers\n",
    "* $s$ is an estimate of scale. \n",
    "* The robust estimates $\\hat{\\beta}$ are computed by the iteratively re-weighted least squares algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* We have several choices available for the weighting functions to be used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "norms = sm.robust.norms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_weights(support, weights_func, xlabels, xticks):\n",
    "    fig = plt.figure(figsize=(12, 8))\n",
    "    ax = fig.add_subplot(111)\n",
    "    ax.plot(support, weights_func(support))\n",
    "    ax.set_xticks(xticks)\n",
    "    ax.set_xticklabels(xlabels, fontsize=16)\n",
    "    ax.set_ylim(-0.1, 1.1)\n",
    "    return ax"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Andrew's Wave"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.AndrewWave.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 1.339\n",
    "support = np.linspace(-np.pi * a, np.pi * a, 100)\n",
    "andrew = norms.AndrewWave(a=a)\n",
    "plot_weights(\n",
    "    support, andrew.weights, [\"$-\\pi*a$\", \"0\", \"$\\pi*a$\"], [-np.pi * a, 0, np.pi * a]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hampel's 17A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.Hampel.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = 8\n",
    "support = np.linspace(-3 * c, 3 * c, 1000)\n",
    "hampel = norms.Hampel(a=2.0, b=4.0, c=c)\n",
    "plot_weights(support, hampel.weights, [\"3*c\", \"0\", \"3*c\"], [-3 * c, 0, 3 * c])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Huber's t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.HuberT.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = 1.345\n",
    "support = np.linspace(-3 * t, 3 * t, 1000)\n",
    "huber = norms.HuberT(t=t)\n",
    "plot_weights(support, huber.weights, [\"-3*t\", \"0\", \"3*t\"], [-3 * t, 0, 3 * t])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Least Squares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.LeastSquares.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "support = np.linspace(-3, 3, 1000)\n",
    "lst_sq = norms.LeastSquares()\n",
    "plot_weights(support, lst_sq.weights, [\"-3\", \"0\", \"3\"], [-3, 0, 3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Ramsay's Ea"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.RamsayE.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 0.3\n",
    "support = np.linspace(-3 * a, 3 * a, 1000)\n",
    "ramsay = norms.RamsayE(a=a)\n",
    "plot_weights(support, ramsay.weights, [\"-3*a\", \"0\", \"3*a\"], [-3 * a, 0, 3 * a])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Trimmed Mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.TrimmedMean.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = 2\n",
    "support = np.linspace(-3 * c, 3 * c, 1000)\n",
    "trimmed = norms.TrimmedMean(c=c)\n",
    "plot_weights(support, trimmed.weights, [\"-3*c\", \"0\", \"3*c\"], [-3 * c, 0, 3 * c])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tukey's Biweight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(norms.TukeyBiweight.weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = 4.685\n",
    "support = np.linspace(-3 * c, 3 * c, 1000)\n",
    "tukey = norms.TukeyBiweight(c=c)\n",
    "plot_weights(support, tukey.weights, [\"-3*c\", \"0\", \"3*c\"], [-3 * c, 0, 3 * c])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scale Estimators"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Robust estimates of the location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = np.array([1, 2, 3, 4, 500])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* The mean is not a robust estimator of location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* The median, on the other hand, is a robust estimator with a breakdown point of 50%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.median(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Analogously for the scale\n",
    "* The standard deviation is not robust"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x.std()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Median Absolute Deviation\n",
    "\n",
    "$$ median_i |X_i - median_j(X_j)|) $$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Standardized Median Absolute Deviation is a consistent estimator for $\\hat{\\sigma}$\n",
    "\n",
    "$$\\hat{\\sigma}=K \\cdot MAD$$\n",
    "\n",
    "where $K$ depends on the distribution. For the normal distribution for example,\n",
    "\n",
    "$$K = \\Phi^{-1}(.75)$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stats.norm.ppf(0.75)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.scale.mad(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.array([1, 2, 3, 4, 5.0]).std()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another robust estimator of scale is the Interquartile Range (IQR)\n",
    "\n",
    "$$\\left(\\hat{X}_{0.75} - \\hat{X}_{0.25}\\right),$$\n",
    "\n",
    "where $\\hat{X}_{p}$ is the sample p-th quantile and $K$ depends on the distribution. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The standardized IQR, given by $K \\cdot \\text{IQR}$ for\n",
    "$$K = \\frac{1}{\\Phi^{-1}(.75) - \\Phi^{-1}(.25)} \\approx 0.74,$$\n",
    "is a consistent estimator of the standard deviation for normal data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.scale.iqr(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The IQR is less robust than the MAD in the sense that it has a lower breakdown point: it can withstand 25\\% outlying observations before being completely ruined, whereas the MAD can withstand 50\\% outlying observations. However, the IQR is better suited for asymmetric distributions."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Yet another robust estimator of scale is the $Q_n$ estimator, introduced in Rousseeuw & Croux (1993), 'Alternatives to the Median Absolute Deviation'. Then $Q_n$ estimator is given by\n",
    "$$\n",
    "Q_n = K \\left\\lbrace \\vert X_{i} - X_{j}\\vert : i<j\\right\\rbrace_{(h)}\n",
    "$$\n",
    "where $h\\approx (1/4){{n}\\choose{2}}$ and $K$ is a given constant. In words, the $Q_n$ estimator is the normalized $h$-th order statistic of the absolute differences of the data. The normalizing constant $K$ is usually chosen as 2.219144, to make the estimator consistent for the standard deviation in the case of normal data. The $Q_n$ estimator has a 50\\% breakdown point and a 82\\% asymptotic efficiency at the normal distribution, much higher than the 37\\% efficiency of the MAD."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.scale.qn_scale(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* The default for Robust Linear Models is MAD\n",
    "* another popular choice is Huber's proposal 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12345)\n",
    "fat_tails = stats.t(6).rvs(40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kde = sm.nonparametric.KDEUnivariate(fat_tails)\n",
    "kde.fit()\n",
    "fig = plt.figure(figsize=(12, 8))\n",
    "ax = fig.add_subplot(111)\n",
    "ax.plot(kde.support, kde.density)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(fat_tails.mean(), fat_tails.std())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(stats.norm.fit(fat_tails))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(stats.t.fit(fat_tails, f0=6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "huber = sm.robust.scale.Huber()\n",
    "loc, scale = huber(fat_tails)\n",
    "print(loc, scale)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.mad(fat_tails)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.mad(fat_tails, c=stats.t(6).ppf(0.75))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.robust.scale.mad(fat_tails)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Duncan's Occupational Prestige data - M-estimation for outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from statsmodels.graphics.api import abline_plot\n",
    "from statsmodels.formula.api import ols, rlm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prestige = sm.datasets.get_rdataset(\"Duncan\", \"carData\", cache=True).data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(prestige.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(12, 12))\n",
    "ax1 = fig.add_subplot(211, xlabel=\"Income\", ylabel=\"Prestige\")\n",
    "ax1.scatter(prestige.income, prestige.prestige)\n",
    "xy_outlier = prestige.loc[\"minister\", [\"income\", \"prestige\"]]\n",
    "ax1.annotate(\"Minister\", xy_outlier, xy_outlier + 1, fontsize=16)\n",
    "ax2 = fig.add_subplot(212, xlabel=\"Education\", ylabel=\"Prestige\")\n",
    "ax2.scatter(prestige.education, prestige.prestige)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ols_model = ols(\"prestige ~ income + education\", prestige).fit()\n",
    "print(ols_model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "infl = ols_model.get_influence()\n",
    "student = infl.summary_frame()[\"student_resid\"]\n",
    "print(student)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(student.loc[np.abs(student) > 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(infl.summary_frame().loc[\"minister\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sidak = ols_model.outlier_test(\"sidak\")\n",
    "sidak.sort_values(\"unadj_p\", inplace=True)\n",
    "print(sidak)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fdr = ols_model.outlier_test(\"fdr_bh\")\n",
    "fdr.sort_values(\"unadj_p\", inplace=True)\n",
    "print(fdr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rlm_model = rlm(\"prestige ~ income + education\", prestige).fit()\n",
    "print(rlm_model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(rlm_model.weights)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hertzprung Russell data for Star Cluster CYG 0B1 - Leverage Points"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Data is on the luminosity and temperature of 47 stars in the direction of Cygnus."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dta = sm.datasets.get_rdataset(\"starsCYG\", \"robustbase\", cache=True).data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib.patches import Ellipse\n",
    "\n",
    "fig = plt.figure(figsize=(12, 8))\n",
    "ax = fig.add_subplot(\n",
    "    111,\n",
    "    xlabel=\"log(Temp)\",\n",
    "    ylabel=\"log(Light)\",\n",
    "    title=\"Hertzsprung-Russell Diagram of Star Cluster CYG OB1\",\n",
    ")\n",
    "ax.scatter(*dta.values.T)\n",
    "# highlight outliers\n",
    "e = Ellipse((3.5, 6), 0.2, 1, alpha=0.25, color=\"r\")\n",
    "ax.add_patch(e)\n",
    "ax.annotate(\n",
    "    \"Red giants\",\n",
    "    xy=(3.6, 6),\n",
    "    xytext=(3.8, 6),\n",
    "    arrowprops=dict(facecolor=\"black\", shrink=0.05, width=2),\n",
    "    horizontalalignment=\"left\",\n",
    "    verticalalignment=\"bottom\",\n",
    "    clip_on=True,  # clip to the axes bounding box\n",
    "    fontsize=16,\n",
    ")\n",
    "# annotate these with their index\n",
    "for i, row in dta.loc[dta[\"log.Te\"] < 3.8].iterrows():\n",
    "    ax.annotate(i, row, row + 0.01, fontsize=14)\n",
    "xlim, ylim = ax.get_xlim(), ax.get_ylim()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "\n",
    "Image(filename=\"star_diagram.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = dta[\"log.light\"]\n",
    "X = sm.add_constant(dta[\"log.Te\"], prepend=True)\n",
    "ols_model = sm.OLS(y, X).fit()\n",
    "abline_plot(model_results=ols_model, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(0.5)).fit()\n",
    "abline_plot(model_results=rlm_mod, ax=ax, color=\"red\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Why? Because M-estimators are not robust to leverage points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "infl = ols_model.get_influence()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "h_bar = 2 * (ols_model.df_model + 1) / ols_model.nobs\n",
    "hat_diag = infl.summary_frame()[\"hat_diag\"]\n",
    "hat_diag.loc[hat_diag > h_bar]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sidak2 = ols_model.outlier_test(\"sidak\")\n",
    "sidak2.sort_values(\"unadj_p\", inplace=True)\n",
    "print(sidak2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fdr2 = ols_model.outlier_test(\"fdr_bh\")\n",
    "fdr2.sort_values(\"unadj_p\", inplace=True)\n",
    "print(fdr2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Let's delete that line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "l = ax.lines[-1]\n",
    "l.remove()\n",
    "del l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights = np.ones(len(X))\n",
    "weights[X[X[\"log.Te\"] < 3.8].index.values - 1] = 0\n",
    "wls_model = sm.WLS(y, X, weights=weights).fit()\n",
    "abline_plot(model_results=wls_model, ax=ax, color=\"green\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* MM estimators are good for this type of problem, unfortunately, we do not yet have these yet. \n",
    "* It's being worked on, but it gives a good excuse to look at the R cell magics in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yy = y.values[:, None]\n",
    "xx = X[\"log.Te\"].values[:, None]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note**: The R code and the results in this notebook has been converted to markdown so that R is not required to build the documents. The R results in the notebook were computed using R 3.5.1 and robustbase 0.93."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```ipython\n",
    "%load_ext rpy2.ipython\n",
    "\n",
    "%R library(robustbase)\n",
    "%Rpush yy xx\n",
    "%R mod <- lmrob(yy ~ xx);\n",
    "%R params <- mod$coefficients;\n",
    "%Rpull params\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```ipython\n",
    "%R print(mod)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "Call:\n",
    "lmrob(formula = yy ~ xx)\n",
    " \\--> method = \"MM\"\n",
    "Coefficients:\n",
    "(Intercept)           xx  \n",
    "     -4.969        2.253  \n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = [-4.969387980288108, 2.2531613477892365]  # Computed using R\n",
    "print(params[0], params[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "abline_plot(intercept=params[0], slope=params[1], ax=ax, color=\"red\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise: Breakdown points of M-estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(12345)\n",
    "nobs = 200\n",
    "beta_true = np.array([3, 1, 2.5, 3, -4])\n",
    "X = np.random.uniform(-20, 20, size=(nobs, len(beta_true) - 1))\n",
    "# stack a constant in front\n",
    "X = sm.add_constant(X, prepend=True)  # np.c_[np.ones(nobs), X]\n",
    "mc_iter = 500\n",
    "contaminate = 0.25  # percentage of response variables to contaminate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_betas = []\n",
    "for i in range(mc_iter):\n",
    "    y = np.dot(X, beta_true) + np.random.normal(size=200)\n",
    "    random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))\n",
    "    y[random_idx] = np.random.uniform(-750, 750)\n",
    "    beta_hat = sm.RLM(y, X).fit().params\n",
    "    all_betas.append(beta_hat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_betas = np.asarray(all_betas)\n",
    "se_loss = lambda x: np.linalg.norm(x, ord=2) ** 2\n",
    "se_beta = lmap(se_loss, all_betas - beta_true)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Squared error loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.array(se_beta).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_betas.mean(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "beta_true"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "se_loss(all_betas.mean(0) - beta_true)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
